In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
In [2]:
df = pd.read_csv("risk_data.csv") # Load data
In [3]:
df.shape
Out[3]:
(5042, 122)
In [4]:
df["DAYS_EMPLOYED"].describe()
Out[4]:
count     5042.000000
mean     -2395.756446
std       2397.824075
min     -16121.000000
25%      -3184.750000
50%      -1649.000000
75%       -746.000000
max         -9.000000
Name: DAYS_EMPLOYED, dtype: float64
In [5]:
365243 /365
Out[5]:
1000.6657534246575

check Missing Values¶

In [6]:
df.isna().sum()
Out[6]:
SK_ID_CURR                      0
TARGET                          0
NAME_CONTRACT_TYPE              0
CODE_GENDER                     0
FLAG_OWN_CAR                    0
                             ... 
AMT_REQ_CREDIT_BUREAU_DAY     666
AMT_REQ_CREDIT_BUREAU_WEEK    666
AMT_REQ_CREDIT_BUREAU_MON     666
AMT_REQ_CREDIT_BUREAU_QRT     666
AMT_REQ_CREDIT_BUREAU_YEAR    666
Length: 122, dtype: int64
In [7]:
# Drop Unnecessary data
drop = ["SK_ID_CURR", "OWN_CAR_AGE", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", 
 "FLAG_CONT_MOBILE", "FLAG_PHONE", "EXT_SOURCE_1", "EXT_SOURCE_2", 
 "EXT_SOURCE_3", "BASEMENTAREA_AVG", "YEARS_BEGINEXPLUATATION_AVG",
 "YEARS_BUILD_AVG","COMMONAREA_AVG", "ELEVATORS_AVG", "ENTRANCES_AVG", 
 "FLOORSMAX_AVG", "FLOORSMIN_AVG", "LANDAREA_AVG","LIVINGAPARTMENTS_AVG",
 "LIVINGAREA_AVG", "NONLIVINGAPARTMENTS_AVG","NONLIVINGAREA_AVG","APARTMENTS_MODE",
 "BASEMENTAREA_MODE",'YEARS_BEGINEXPLUATATION_MODE',"YEARS_BUILD_MODE","COMMONAREA_MODE",
 "ELEVATORS_MODE","ENTRANCES_MODE","FLOORSMAX_MODE","FLOORSMIN_MODE","LANDAREA_MODE",
 'LIVINGAPARTMENTS_MODE', "LIVINGAREA_MODE","NONLIVINGAPARTMENTS_MODE","NONLIVINGAREA_MODE",
 "APARTMENTS_MEDI","BASEMENTAREA_MEDI","YEARS_BEGINEXPLUATATION_MEDI","YEARS_BUILD_MEDI",
 "COMMONAREA_MEDI","ELEVATORS_MEDI","ENTRANCES_MEDI","FLOORSMAX_MEDI","FLOORSMIN_MEDI",
"LANDAREA_MEDI","LIVINGAPARTMENTS_MEDI","LIVINGAREA_MEDI","NONLIVINGAPARTMENTS_MEDI","NONLIVINGAREA_MEDI",
 "TOTALAREA_MODE","OBS_30_CNT_SOCIAL_CIRCLE","DEF_30_CNT_SOCIAL_CIRCLE","OBS_60_CNT_SOCIAL_CIRCLE",
 "DEF_60_CNT_SOCIAL_CIRCLE","DAYS_LAST_PHONE_CHANGE","FLAG_DOCUMENT_2","FLAG_DOCUMENT_3","FLAG_DOCUMENT_4",
 "FLAG_DOCUMENT_5","FLAG_DOCUMENT_6","FLAG_DOCUMENT_7","FLAG_DOCUMENT_8","FLAG_DOCUMENT_9","FLAG_DOCUMENT_10",
 "FLAG_DOCUMENT_11","FLAG_DOCUMENT_12","FLAG_DOCUMENT_13","FLAG_DOCUMENT_14","FLAG_DOCUMENT_15",
 "FLAG_DOCUMENT_16","FLAG_DOCUMENT_17","FLAG_DOCUMENT_18","FLAG_DOCUMENT_19","FLAG_DOCUMENT_20","FLAG_DOCUMENT_21",
 "APARTMENTS_AVG", "FONDKAPREMONT_MODE", "HOUSETYPE_MODE", "WALLSMATERIAL_MODE", "EMERGENCYSTATE_MODE", "OCCUPATION_TYPE", 
 "FLAG_MOBIL", "FLAG_EMAIL"]

df.drop(drop, axis=1, inplace=True)
In [8]:
df.isna().sum()
df.dropna(inplace=True)
In [9]:
df.shape
Out[9]:
(4363, 38)
In [10]:
df.columns
Out[10]:
Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS',
       'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
       'ORGANIZATION_TYPE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')

Numeric variables¶

In [11]:
df.describe()
df.drop(["AMT_REQ_CREDIT_BUREAU_DAY","AMT_REQ_CREDIT_BUREAU_WEEK",
         "AMT_REQ_CREDIT_BUREAU_HOUR","AMT_REQ_CREDIT_BUREAU_YEAR",
         "AMT_REQ_CREDIT_BUREAU_QRT","AMT_REQ_CREDIT_BUREAU_MON"],axis=1, inplace=True)
In [12]:
drop = ["REG_REGION_NOT_LIVE_REGION",
        "REG_REGION_NOT_WORK_REGION",
        "LIVE_REGION_NOT_WORK_REGION",
        "REG_CITY_NOT_LIVE_CITY",
        "REG_CITY_NOT_WORK_CITY",
        "LIVE_CITY_NOT_WORK_CITY"]
df.drop(drop, axis=1, inplace=True)
In [13]:
df.describe()
Out[13]:
TARGET CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START
count 4363.000000 4363.000000 4.363000e+03 4.363000e+03 4363.000000 4.363000e+03 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000
mean 0.082970 0.471923 1.797927e+05 6.317434e+05 28244.090649 5.668660e+05 0.021071 -15028.916571 -2437.450607 -4728.646115 -2875.125602 2.224158 2.039881 2.021774 12.285354
std 0.275869 0.717776 1.006784e+05 4.168209e+05 14728.222054 3.821942e+05 0.013931 3646.057455 2409.073236 3300.520654 1491.215163 0.898158 0.516811 0.512327 3.302137
min 0.000000 0.000000 3.510000e+04 4.500000e+04 2673.000000 4.500000e+04 0.000533 -25126.000000 -15860.000000 -18294.000000 -6207.000000 1.000000 1.000000 1.000000 0.000000
25% 0.000000 0.000000 1.125000e+05 2.844000e+05 17286.750000 2.475000e+05 0.010006 -17880.500000 -3267.000000 -7128.000000 -4207.500000 2.000000 2.000000 2.000000 10.000000
50% 0.000000 0.000000 1.575000e+05 5.400000e+05 26284.500000 4.545000e+05 0.018850 -14864.000000 -1686.000000 -4298.000000 -2995.000000 2.000000 2.000000 2.000000 12.000000
75% 0.000000 1.000000 2.250000e+05 8.550000e+05 35964.000000 7.290000e+05 0.028663 -12053.500000 -766.000000 -1944.000000 -1629.000000 3.000000 2.000000 2.000000 15.000000
max 1.000000 4.000000 1.350000e+06 2.700000e+06 225000.000000 2.700000e+06 0.072508 -7721.000000 -9.000000 -3.000000 -1.000000 6.000000 3.000000 3.000000 23.000000
In [14]:
df.dtypes
Out[14]:
TARGET                           int64
NAME_CONTRACT_TYPE              object
CODE_GENDER                     object
FLAG_OWN_CAR                    object
FLAG_OWN_REALTY                 object
CNT_CHILDREN                     int64
AMT_INCOME_TOTAL               float64
AMT_CREDIT                     float64
AMT_ANNUITY                    float64
AMT_GOODS_PRICE                float64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_FAMILY_STATUS              object
NAME_HOUSING_TYPE               object
REGION_POPULATION_RELATIVE     float64
DAYS_BIRTH                       int64
DAYS_EMPLOYED                    int64
DAYS_REGISTRATION              float64
DAYS_ID_PUBLISH                  int64
CNT_FAM_MEMBERS                float64
REGION_RATING_CLIENT             int64
REGION_RATING_CLIENT_W_CITY      int64
WEEKDAY_APPR_PROCESS_START      object
HOUR_APPR_PROCESS_START          int64
ORGANIZATION_TYPE               object
dtype: object
In [15]:
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot(df["AMT_CREDIT"])
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/2776091565.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  ax = sns.distplot(df["AMT_CREDIT"])
In [16]:
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot(np.log(df["AMT_CREDIT"]))
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/854655729.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  ax = sns.distplot(np.log(df["AMT_CREDIT"]))
In [17]:
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_INCOME_TOTAL")
ax = sns.distplot(df["AMT_INCOME_TOTAL"].dropna())
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/873008540.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  ax = sns.distplot(df["AMT_INCOME_TOTAL"].dropna())
In [18]:
plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_GOODS_PRICE")
ax = sns.distplot(df["AMT_GOODS_PRICE"].dropna())
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/3108533166.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  ax = sns.distplot(df["AMT_GOODS_PRICE"].dropna())

Some of variables such as Region_rating_client_W_CITY has very bad distribution.

In [19]:
df.describe()
Out[19]:
TARGET CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START
count 4363.000000 4363.000000 4.363000e+03 4.363000e+03 4363.000000 4.363000e+03 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000
mean 0.082970 0.471923 1.797927e+05 6.317434e+05 28244.090649 5.668660e+05 0.021071 -15028.916571 -2437.450607 -4728.646115 -2875.125602 2.224158 2.039881 2.021774 12.285354
std 0.275869 0.717776 1.006784e+05 4.168209e+05 14728.222054 3.821942e+05 0.013931 3646.057455 2409.073236 3300.520654 1491.215163 0.898158 0.516811 0.512327 3.302137
min 0.000000 0.000000 3.510000e+04 4.500000e+04 2673.000000 4.500000e+04 0.000533 -25126.000000 -15860.000000 -18294.000000 -6207.000000 1.000000 1.000000 1.000000 0.000000
25% 0.000000 0.000000 1.125000e+05 2.844000e+05 17286.750000 2.475000e+05 0.010006 -17880.500000 -3267.000000 -7128.000000 -4207.500000 2.000000 2.000000 2.000000 10.000000
50% 0.000000 0.000000 1.575000e+05 5.400000e+05 26284.500000 4.545000e+05 0.018850 -14864.000000 -1686.000000 -4298.000000 -2995.000000 2.000000 2.000000 2.000000 12.000000
75% 0.000000 1.000000 2.250000e+05 8.550000e+05 35964.000000 7.290000e+05 0.028663 -12053.500000 -766.000000 -1944.000000 -1629.000000 3.000000 2.000000 2.000000 15.000000
max 1.000000 4.000000 1.350000e+06 2.700000e+06 225000.000000 2.700000e+06 0.072508 -7721.000000 -9.000000 -3.000000 -1.000000 6.000000 3.000000 3.000000 23.000000
In [20]:
import plotly.express as px
num_vars = df.select_dtypes(include=['float', 'int']).columns.tolist()

# Plot each numerical variable
for var in num_vars:
    fig = px.box(df, x=var)
    fig.show()
In [21]:
import plotly.graph_objects as go

numeric_cols = df.select_dtypes(include=['float64', 'int64'])
traces = []
for col in numeric_cols.columns:
    trace = go.Box(
        y=numeric_cols[col],
        name=col,
        boxpoints='outliers'
    )
    traces.append(trace)

# Create the layout and plot the figure
layout = go.Layout(
    title='Boxplot of Numeric Variables',
    xaxis=dict(title='Value')
)
fig = go.Figure(data=traces, layout=layout)
fig.show()
In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_cols = scaler.fit_transform(numeric_cols)
scaled_df = pd.DataFrame(scaled_cols, columns=numeric_cols.columns)

# Create a list of traces for each column
traces = []
for col in scaled_df.columns:
    trace = go.Box(
        y=scaled_df[col],
        name=col,
        boxpoints='outliers'
    )
    traces.append(trace)

# Create the layout and plot the figure
layout = go.Layout(
    title='Boxplot of Scaled Numeric Variables',
    xaxis=dict(title='Scaled Value')
)
fig = go.Figure(data=traces, layout=layout)
fig.show()

Categorical EDA&Trans¶

Target¶

In [23]:
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
def bar_hor(df, col, title, color, w=None, h=None, lm=0, limit=100, return_trace=False, rev=False, xlb = False):
    cnt_srs = df[col].value_counts()
    yy = cnt_srs.head(limit).index[::-1] 
    xx = cnt_srs.head(limit).values[::-1] 
    if rev:
        yy = cnt_srs.tail(limit).index[::-1] 
        xx = cnt_srs.tail(limit).values[::-1] 
    if xlb:
        trace = go.Bar(y=xlb, x=xx, orientation = 'h', marker=dict(color=color))
    else:
        trace = go.Bar(y=yy, x=xx, orientation = 'h', marker=dict(color=color))
    if return_trace:
        return trace 
    layout = dict(title=title, margin=dict(l=lm), width=w, height=h)
    data = [trace]
    fig = go.Figure(data=data, layout=layout)
    return fig
    
bar_hor(df, "TARGET", "Distribution of Target Variable" ,
        ["#96D38C", '#FEBFB3'], h=350, w=600, lm=200, xlb = ['Target : 1','Target : 0'])

gender¶

In [24]:
def gp(col, title):
    df1 = df[df["TARGET"] == 1]
    df0 = df[df["TARGET"] == 0]
    a1 = df1[col].value_counts()
    b1 = df0[col].value_counts()
    
    total = dict(df[col].value_counts())
    x0 = a1.index
    x1 = b1.index
    
    y0 = [float(x)*100 / total[x0[i]] for i,x in enumerate(a1.values)]
    y1 = [float(x)*100 / total[x1[i]] for i,x in enumerate(b1.values)]

    trace1 = go.Bar(x=a1.index, y=y0, name='Target : 1', marker=dict(color="#96D38C"))
    trace2 = go.Bar(x=b1.index, y=y1, name='Target : 0', marker=dict(color="#FEBFB3"))
    return trace1, trace2 
In [25]:
tr0 = bar_hor(df, "CODE_GENDER", "Distribution of CODE_GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1, tr2 = gp('CODE_GENDER', 'Distribution of Target with Applicant Gender')

fig = tools.make_subplots(rows=1, cols=3, print_grid=False, subplot_titles = ["Gender Distribution" , "Gender, Target=1" ,"Gender, Target=0"])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig.append_trace(tr2, 1, 3);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=50));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

Number of Females are much more than number of Men in the data

SUITE and Income¶

In [26]:
import plotly.graph_objects as go
temp = df["NAME_TYPE_SUITE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
    x = temp.index,
    y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
    title = "Who accompanied client when applying for the  application in % ",
    xaxis=dict(
        title='Name of type of the Suite',
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Count of Name of type of the Suite in %',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
)
)
fig = go.Figure(data=data, layout=layout)
fig
In [27]:
import plotly.graph_objects as go
temp = df["NAME_INCOME_TYPE"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
    x = temp.index,
    y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
    title = "Who accompanied client when applying for the  application in % ",
    xaxis=dict(
        title='Name of type of the Suite',
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Count of Name of type of the Suite in %',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
)
)
fig = go.Figure(data=data, layout=layout)
fig
In [28]:
df["NAME_INCOME_TYPE"] = df["NAME_INCOME_TYPE"].map({"Commercial associate":"Businessman", 
                                                             "Student":"Unemployed", "Maternity leave":"Unemployed",
                                                            "State servant":"State servent", "Working":"Working",
                                                            "Unemployed":"Unemployed","Pensioner":"Pensioner"})
In [29]:
df["NAME_TYPE_SUITE"] = df["NAME_TYPE_SUITE"].map({"Spouse, partner": "Others",
                                                                 "Children":"Others","Other_B":"Others",
                                                                 "Other_A":"Others","Group of people":"Others",
                                                                 "Unaccompanied":"Unaccompanied","Family":"Family"})
In [30]:
df = df[df["NAME_INCOME_TYPE"] != "Unemployed"]
In [31]:
tr0 = bar_hor(df, "NAME_TYPE_SUITE", "Distribution of CODE_GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)
tr1 = bar_hor(df, "NAME_INCOME_TYPE", "Distribution of CODE_GENDER Variable" ,"#f975ae", w=700, lm=100, return_trace= True)

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Suite Type' , 'Applicants Income Type'])
fig.append_trace(tr0, 1, 1);
fig.append_trace(tr1, 1, 2);
fig['layout'].update(height=400, showlegend=False, margin=dict(l=100));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

In [32]:
tr1, tr2 = gp('NAME_TYPE_SUITE', 'Applicants Type Suites which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, 
                          subplot_titles = ["Applicants Type Suites distribution when Target = 1", "Applicants Type Suites distribution when Target = 0"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);


tr1, tr2 = gp('NAME_INCOME_TYPE', 'Applicants Income Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, 
                          subplot_titles = ["Applicants Income Types when Target = 1", "Applicants Income Type When Target = 0"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=120));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

Car¶

In [33]:
temp = df["FLAG_OWN_CAR"].value_counts()
#print("Total number of states : ",len(temp))
trace = go.Bar(
    x = temp.index,
    y = (temp / temp.sum())*100,
)
data = [trace]
layout = go.Layout(
    title = "Who accompanied client when applying for the  application in % ",
    xaxis=dict(
        title='Name of type of the Suite',
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='Count of Name of type of the Suite in %',
        titlefont=dict(
            size=16,
            color='rgb(107, 107, 107)'
        ),
        tickfont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
)
)
fig = go.Figure(data=data, layout=layout)
fig

EDU¶

In [34]:
tr1 = bar_hor(df, "NAME_EDUCATION_TYPE", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace= True)
tr2 = bar_hor(df, "NAME_HOUSING_TYPE", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace = True)

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['Applicants Education Type', 'Applicants Housing Type' ])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=400,showlegend=False, margin=dict(l=100));
iplot(fig);


tr1, tr2 = gp('NAME_EDUCATION_TYPE', 'Applicants Income Types which repayed the loan')
tr3, tr4 = gp('NAME_HOUSING_TYPE', 'Applicants Income Types which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, 
                          subplot_titles = ["Applicants Education Types, Target=1", "Applicants Housing Type, Target=1"])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr3, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=30));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

In [35]:
tr1 = bar_hor(df, "NAME_FAMILY_STATUS", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace= True)
tr2 = bar_hor(df, "FLAG_OWN_CAR", "Distribution of " ,"#f975ae", w=700, lm=100, return_trace = True)

fig = tools.make_subplots(rows=1, cols=2, print_grid=False, subplot_titles = ['NAME_FAMILY_STATUS', 'FLAG OWN CAR' ])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr2, 1, 2);
fig['layout'].update(height=400,showlegend=False, margin=dict(l=100));
iplot(fig);


tr1, tr2 = gp('NAME_FAMILY_STATUS', 'FLAG_OWN_CAR which repayed the loan')
tr3, tr4 = gp('FLAG_OWN_CAR', 'FLAG_OWN_CAR which repayed the loan')
fig = tools.make_subplots(rows=1, cols=2, print_grid=False, 
                          subplot_titles = ['NAME_FAMILY_STATUS, target = 1', 'FLAG OWN CAR. target = 1'])
fig.append_trace(tr1, 1, 1);
fig.append_trace(tr3, 1, 2);
fig['layout'].update(height=350, showlegend=False, margin=dict(l=30));
iplot(fig);
/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

/opt/homebrew/Caskroom/miniforge/base/envs/ml/lib/python3.9/site-packages/plotly/tools.py:460: DeprecationWarning:

plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead

standardized Numerical¶

In [36]:
df.dtypes
Out[36]:
TARGET                           int64
NAME_CONTRACT_TYPE              object
CODE_GENDER                     object
FLAG_OWN_CAR                    object
FLAG_OWN_REALTY                 object
CNT_CHILDREN                     int64
AMT_INCOME_TOTAL               float64
AMT_CREDIT                     float64
AMT_ANNUITY                    float64
AMT_GOODS_PRICE                float64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_FAMILY_STATUS              object
NAME_HOUSING_TYPE               object
REGION_POPULATION_RELATIVE     float64
DAYS_BIRTH                       int64
DAYS_EMPLOYED                    int64
DAYS_REGISTRATION              float64
DAYS_ID_PUBLISH                  int64
CNT_FAM_MEMBERS                float64
REGION_RATING_CLIENT             int64
REGION_RATING_CLIENT_W_CITY      int64
WEEKDAY_APPR_PROCESS_START      object
HOUR_APPR_PROCESS_START          int64
ORGANIZATION_TYPE               object
dtype: object
In [37]:
df["TARGET"] = df["TARGET"].astype(str) 
In [38]:
from sklearn.preprocessing import MinMaxScaler
# select only numerical columns
num_cols = df.select_dtypes(include=['float',"int"]).columns
# create scaler object
scaler = MinMaxScaler()
# fit and transform data using scaler
scaled_data = scaler.fit_transform(df[num_cols])
# assign back to original column names
df.loc[:, num_cols] = scaled_data
In [39]:
df.columns
Out[39]:
Index(['TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'CNT_FAM_MEMBERS',
       'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
       'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
       'ORGANIZATION_TYPE'],
      dtype='object')
In [40]:
df.describe()
Out[40]:
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY HOUR_APPR_PROCESS_START
count 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000 4363.000000
mean 0.117981 0.110041 0.220996 0.115016 0.196560 0.285346 0.580125 0.846795 0.741641 0.536880 0.244832 0.519940 0.510887 0.534146
std 0.179444 0.076567 0.156995 0.066246 0.143953 0.193560 0.209483 0.151982 0.180445 0.240286 0.179632 0.258405 0.256163 0.143571
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.058864 0.090169 0.065731 0.076271 0.131615 0.416288 0.794461 0.610464 0.322188 0.200000 0.500000 0.500000 0.434783
50% 0.000000 0.093087 0.186441 0.106202 0.154237 0.254491 0.589601 0.894202 0.765185 0.517564 0.200000 0.500000 0.500000 0.521739
75% 0.250000 0.144422 0.305085 0.149739 0.257627 0.390830 0.751077 0.952243 0.893882 0.737673 0.400000 0.500000 0.500000 0.652174
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [41]:
from scipy.stats import boxcox

plt.figure(figsize=(12,5))
plt.title("Distribution of AMT_CREDIT")
ax = sns.distplot((df["AMT_CREDIT"]))
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/1739910397.py:5: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


Feature Crossing¶

In [42]:
from sklearn.preprocessing import PolynomialFeatures
df_num = df[num_cols]

poly = PolynomialFeatures(interaction_only=True, include_bias=False)
crossed = poly.fit_transform(df_num)

# create feature names manually
feature_names = poly.get_feature_names_out(df_num.columns)
feature_names = [name.replace(' ', '*') for name in feature_names]

# print the crossed features and their names
df_crossing = pd.DataFrame(crossed, columns=feature_names)
In [43]:
df_crossing.head()
Out[43]:
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH ... DAYS_ID_PUBLISH*CNT_FAM_MEMBERS DAYS_ID_PUBLISH*REGION_RATING_CLIENT DAYS_ID_PUBLISH*REGION_RATING_CLIENT_W_CITY DAYS_ID_PUBLISH*HOUR_APPR_PROCESS_START CNT_FAM_MEMBERS*REGION_RATING_CLIENT CNT_FAM_MEMBERS*REGION_RATING_CLIENT_W_CITY CNT_FAM_MEMBERS*HOUR_APPR_PROCESS_START REGION_RATING_CLIENT*REGION_RATING_CLIENT_W_CITY REGION_RATING_CLIENT*HOUR_APPR_PROCESS_START REGION_RATING_CLIENT_W_CITY*HOUR_APPR_PROCESS_START
0 0.00 0.315537 0.375729 0.171052 0.322034 0.193553 0.490779 0.431077 0.760757 0.979053 ... 0.195811 0.489526 0.489526 0.297973 0.1 0.1 0.060870 0.25 0.152174 0.152174
1 0.50 0.058864 0.224136 0.128588 0.152542 0.489878 0.851422 0.799697 0.876169 0.517403 ... 0.310442 0.258701 0.258701 0.269949 0.3 0.3 0.313043 0.25 0.260870 0.260870
2 0.25 0.058864 0.101356 0.074404 0.067797 0.080195 0.960873 0.964608 0.850582 0.870770 ... 0.348308 0.870770 0.870770 0.189298 0.4 0.4 0.086957 1.00 0.217391 0.217391
3 0.50 0.058864 0.066873 0.068231 0.059322 0.076068 0.817639 0.883162 0.725603 0.882050 ... 0.529230 0.441025 0.441025 0.383500 0.3 0.3 0.260870 0.25 0.217391 0.217391
4 0.00 0.134155 0.340420 0.113347 0.281356 0.112428 0.526400 0.954262 0.452299 0.365292 ... 0.073058 0.182646 0.182646 0.174705 0.1 0.1 0.095652 0.25 0.239130 0.239130

5 rows × 105 columns

In [44]:
df_crossing.columns
Out[44]:
Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH',
       ...
       'DAYS_ID_PUBLISH*CNT_FAM_MEMBERS',
       'DAYS_ID_PUBLISH*REGION_RATING_CLIENT',
       'DAYS_ID_PUBLISH*REGION_RATING_CLIENT_W_CITY',
       'DAYS_ID_PUBLISH*HOUR_APPR_PROCESS_START',
       'CNT_FAM_MEMBERS*REGION_RATING_CLIENT',
       'CNT_FAM_MEMBERS*REGION_RATING_CLIENT_W_CITY',
       'CNT_FAM_MEMBERS*HOUR_APPR_PROCESS_START',
       'REGION_RATING_CLIENT*REGION_RATING_CLIENT_W_CITY',
       'REGION_RATING_CLIENT*HOUR_APPR_PROCESS_START',
       'REGION_RATING_CLIENT_W_CITY*HOUR_APPR_PROCESS_START'],
      dtype='object', length=105)
In [45]:
df_crossing["AMT_CREDIT*AMT_ANNUITY"]
Out[45]:
0       0.064269
1       0.028821
2       0.007541
3       0.004563
4       0.038585
          ...   
4358    0.015585
4359    0.124789
4360    0.049024
4361    0.001447
4362    0.006783
Name: AMT_CREDIT*AMT_ANNUITY, Length: 4363, dtype: float64
In [46]:
sns.scatterplot(data=df_crossing, x="AMT_INCOME_TOTAL*AMT_ANNUITY", y="AMT_INCOME_TOTAL")
Out[46]:
<AxesSubplot: xlabel='AMT_INCOME_TOTAL*AMT_ANNUITY', ylabel='AMT_INCOME_TOTAL'>
In [47]:
sns.set(font_scale=0.8)
sns.set_style("whitegrid")
sns.set_palette("husl")
sns.set(rc={"figure.figsize":(12,6)})
sns.heatmap(df_crossing.corr(), annot=False, cmap="coolwarm")
Out[47]:
<AxesSubplot: >

Dimension reduction FAMD¶

In [48]:
import prince

famd = prince.FAMD(n_components=2, n_iter=3,
                   copy=True, check_input=True,
                   engine='sklearn',random_state=42)

famd = famd.fit(df)
coords = famd.row_coordinates(df)
In [49]:
sns.scatterplot(coords, x=0, y=1, hue=df["TARGET"])
Out[49]:
<AxesSubplot: xlabel='0', ylabel='1'>

Outlier Detection: Isolation Tree¶

In [50]:
df.dtypes
Out[50]:
TARGET                          object
NAME_CONTRACT_TYPE              object
CODE_GENDER                     object
FLAG_OWN_CAR                    object
FLAG_OWN_REALTY                 object
CNT_CHILDREN                   float64
AMT_INCOME_TOTAL               float64
AMT_CREDIT                     float64
AMT_ANNUITY                    float64
AMT_GOODS_PRICE                float64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_FAMILY_STATUS              object
NAME_HOUSING_TYPE               object
REGION_POPULATION_RELATIVE     float64
DAYS_BIRTH                     float64
DAYS_EMPLOYED                  float64
DAYS_REGISTRATION              float64
DAYS_ID_PUBLISH                float64
CNT_FAM_MEMBERS                float64
REGION_RATING_CLIENT           float64
REGION_RATING_CLIENT_W_CITY    float64
WEEKDAY_APPR_PROCESS_START      object
HOUR_APPR_PROCESS_START        float64
ORGANIZATION_TYPE               object
dtype: object
In [51]:
df_dummy = pd.get_dummies(df)
In [52]:
from sklearn.ensemble import IsolationForest
dat_iso = df_dummy
model = IsolationForest(n_estimators = 300,max_samples ='auto',contamination = "auto", max_features = 1.0)
model.fit(dat_iso)
scores = model.decision_function(dat_iso)
anomaly = model.predict(dat_iso)
dat_iso['scores'] = scores
dat_iso['anomaly'] = anomaly
anomaly = dat_iso.loc[dat_iso['anomaly'] == -1]
anomaly_index = list(dat_iso.index)
dat_iso_drop = dat_iso[dat_iso.anomaly == 1]
dat_iso_drop.head()
dat_iso_drop.drop(columns=['scores', 'anomaly'], inplace= True)
dat_iso_drop.head()
/var/folders/sv/npxlc_k53696tn8hryg5dx5w0000gn/T/ipykernel_4688/3678961677.py:13: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[52]:
CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH ... ORGANIZATION_TYPE_Trade: type 2 ORGANIZATION_TYPE_Trade: type 3 ORGANIZATION_TYPE_Trade: type 4 ORGANIZATION_TYPE_Trade: type 6 ORGANIZATION_TYPE_Trade: type 7 ORGANIZATION_TYPE_Transport: type 1 ORGANIZATION_TYPE_Transport: type 2 ORGANIZATION_TYPE_Transport: type 3 ORGANIZATION_TYPE_Transport: type 4 ORGANIZATION_TYPE_University
0 0.00 0.315537 0.375729 0.171052 0.322034 0.193553 0.490779 0.431077 0.760757 0.979053 ... 0 0 0 0 0 0 0 0 0 0
1 0.50 0.058864 0.224136 0.128588 0.152542 0.489878 0.851422 0.799697 0.876169 0.517403 ... 0 0 0 0 0 0 0 0 0 0
2 0.25 0.058864 0.101356 0.074404 0.067797 0.080195 0.960873 0.964608 0.850582 0.870770 ... 0 0 0 0 0 0 0 0 0 0
3 0.50 0.058864 0.066873 0.068231 0.059322 0.076068 0.817639 0.883162 0.725603 0.882050 ... 0 0 0 0 0 0 0 0 0 0
5 0.00 0.134155 0.340420 0.113347 0.281356 0.112428 0.526400 0.954262 0.452299 0.365292 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 106 columns

In [53]:
df_dummy.shape
Out[53]:
(4363, 108)
In [54]:
dat_iso_drop.shape
Out[54]:
(4358, 106)
In [55]:
num_vars = df.select_dtypes(include=['float', 'int']).columns.tolist()

# Plot each numerical variable
for var in num_vars:
    fig = px.box(dat_iso_drop, x=var)
    fig.show()